/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.fetcher;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.LineNumberReader;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.StringTokenizer;
import java.util.logging.Logger;
import java.util.logging.Level;
import java.util.logging.Handler;
import net.nutch.util.LogFormatter;
/**
 * This class handles the parsing of <code>robots.txt</code> files.
 * It emits RobotRuleSet objects, which describe the download
 * permissions for the set of robot names supplied to the parser.
 *
 * @author Tom Pierce, modified by Mike Cafarella
 */
public class RobotRulesParser {
// Shared logger for this class, obtained through Nutch's LogFormatter.
public static final Logger LOG=
LogFormatter.getLogger("net.nutch.fetcher.RobotRulesParser");
// Maps lower-cased agent name -> Integer precedence (lower index = higher priority).
private HashMap robotNames;
// Charset name used when URL-decoding paths found in robots.txt rules.
private static final String CHARACTER_ENCODING= "UTF-8";
// Sentinel precedence meaning "no agent stanza has matched".
private static final int NO_PRECEDENCE= Integer.MAX_VALUE;
// Shared allow-everything rule set, returned when no rules apply.
private static final RobotRuleSet EMPTY_RULES= new RobotRuleSet();
/**
 * This class holds the rules which were parsed from a robots.txt
 * file, and can test paths against those rules.
 *
 * <p>Prefixes are tested in the order they were added; the first
 * matching prefix decides the outcome, and a path matching no prefix
 * is allowed.
 */
public static class RobotRuleSet {
  ArrayList tmpEntries;   // staging list while rules are being added
  RobotsEntry[] entries;  // frozen array form, built lazily by isAllowed()
  long expireTime;        // time at which these cached rules go stale

  /**
   * One (path prefix, allow/disallow) pair.  Declared static: it
   * needs no reference to the enclosing RobotRuleSet instance.
   */
  private static class RobotsEntry {
    String prefix;
    boolean allowed;

    RobotsEntry(String prefix, boolean allowed) {
      this.prefix= prefix;
      this.allowed= allowed;
    }
  }

  /**
   * should not be instantiated from outside RobotRulesParser
   */
  private RobotRuleSet() {
    tmpEntries= new ArrayList();
    entries= null;
  }

  /**
   * Records a rule: paths starting with <code>prefix</code> are
   * allowed (<code>true</code>) or disallowed (<code>false</code>).
   * If the rules were already frozen into the array form by
   * isAllowed(), they are copied back into the staging list first.
   */
  private void addPrefix(String prefix, boolean allow) {
    if (tmpEntries == null) {
      tmpEntries= new ArrayList();
      if (entries != null) {
        for (int i= 0; i < entries.length; i++)
          tmpEntries.add(entries[i]);
      }
      entries= null;
    }
    tmpEntries.add(new RobotsEntry(prefix, allow));
  }

  /**
   * Discards every rule added so far (used for the "empty rule" case
   * in robots.txt parsing).
   */
  private void clearPrefixes() {
    if (tmpEntries == null) {
      tmpEntries= new ArrayList();
      entries= null;
    } else {
      tmpEntries.clear();
    }
  }

  /**
   * Change when the ruleset goes stale.
   */
  public void setExpireTime(long expireTime) {
    this.expireTime = expireTime;
  }

  /**
   * Get expire time
   */
  public long getExpireTime() {
    return expireTime;
  }

  /**
   * Returns <code>false</code> if the <code>robots.txt</code> file
   * prohibits us from accessing the given <code>path</code>, or
   * <code>true</code> otherwise.
   */
  public boolean isAllowed(String path) {
    try {
      path= URLDecoder.decode(path, CHARACTER_ENCODING);
    } catch (Exception e) {
      // just ignore it- we can still try to match
      // path prefixes
    }

    if (entries == null) {
      // freeze the staged rules into the array representation
      entries= new RobotsEntry[tmpEntries.size()];
      entries= (RobotsEntry[])
        tmpEntries.toArray(entries);
      tmpEntries= null;
    }

    for (int i= 0; i < entries.length; i++) {
      if (path.startsWith(entries[i].prefix))
        return entries[i].allowed;   // first matching prefix wins
    }
    return true;                     // no rule matched: allowed by default
  }

  /**
   * Lists the rules, one "Allow:"/"Disallow:" line per entry.
   */
  public String toString() {
    isAllowed("x"); // force String[] representation
    String eol= System.getProperty("line.separator");  // hoisted out of the loop
    StringBuffer buf= new StringBuffer();
    for (int i= 0; i < entries.length; i++) {
      if (entries[i].allowed)
        buf.append("Allow: " + entries[i].prefix + eol);
      else
        buf.append("Disallow: " + entries[i].prefix + eol);
    }
    return buf.toString();
  }
}
/**
* Creates a new <code>RobotRulesParser</code> which will use the
* supplied <code>robotNames</code> when choosing which stanza to
* follow in <code>robots.txt</code> files. Any name in the array
* may be matched. The order of the <code>robotNames</code>
* determines the precedence- if many names are matched, only the
* rules associated with the robot name having the smallest index
* will be used.
*/
public RobotRulesParser(String[] robotNames) {
this.robotNames= new HashMap();
for (int i= 0; i < robotNames.length; i++) {
this.robotNames.put(robotNames[i].toLowerCase(), new Integer(i));
}
// always make sure "*" is included
if (!this.robotNames.containsKey("*"))
this.robotNames.put("*", new Integer(robotNames.length));
}
/**
 * Returns a {@link RobotRuleSet} object which encapsulates the
 * rules parsed from the supplied <code>robotContent</code>.
 *
 * <p>Scans the file stanza by stanza: "User-agent:" lines open a
 * stanza, and subsequent "Allow:"/"Disallow:" lines are recorded only
 * while the current stanza names one of our robots.  If several
 * stanzas match, the one matching the highest-precedence
 * (lowest-index) robot name wins.  Returns an allow-everything rule
 * set when <code>robotContent</code> is null or no stanza applies.
 */
RobotRuleSet parseRules(byte[] robotContent) {
  if (robotContent == null)
    return EMPTY_RULES;

  // Decode with the declared charset rather than the platform default,
  // so parsing does not vary from machine to machine.
  String content;
  try {
    content= new String(robotContent, CHARACTER_ENCODING);
  } catch (UnsupportedEncodingException e) {
    // UTF-8 is supported by every JVM; this fallback is defensive only
    content= new String(robotContent);
  }

  StringTokenizer lineParser= new StringTokenizer(content, "\n\r");

  RobotRuleSet bestRulesSoFar= null;
  int bestPrecedenceSoFar= NO_PRECEDENCE;

  RobotRuleSet currentRules= new RobotRuleSet();
  int currentPrecedence= NO_PRECEDENCE;

  boolean addRules= false;    // in stanza for our robot
  boolean doneAgents= false;  // detect multiple agent lines

  while (lineParser.hasMoreTokens()) {
    String line= lineParser.nextToken();

    // trim out comments and whitespace
    int hashPos= line.indexOf("#");
    if (hashPos >= 0)
      line= line.substring(0, hashPos);
    line= line.trim();

    if ( (line.length() >= 11)
         && (line.substring(0, 11).equalsIgnoreCase("User-agent:")) ) {

      if (doneAgents) {
        // rule lines were seen since the last agent line, so this
        // "User-agent:" starts a new stanza; archive the previous one
        if (currentPrecedence < bestPrecedenceSoFar) {
          bestPrecedenceSoFar= currentPrecedence;
          bestRulesSoFar= currentRules;
          currentPrecedence= NO_PRECEDENCE;
          currentRules= new RobotRuleSet();
        }
        addRules= false;
      }
      doneAgents= false;

      String agentNames= line.substring(line.indexOf(":") + 1);
      agentNames= agentNames.trim();
      StringTokenizer agentTokenizer= new StringTokenizer(agentNames);

      while (agentTokenizer.hasMoreTokens()) {
        // for each agent listed, see if it's us:
        String agentName= agentTokenizer.nextToken().toLowerCase();

        Integer precedenceInt= (Integer) robotNames.get(agentName);
        if (precedenceInt != null) {
          int precedence= precedenceInt.intValue();
          if ( (precedence < currentPrecedence)
               && (precedence < bestPrecedenceSoFar) )
            currentPrecedence= precedence;
        }
      }

      if (currentPrecedence < bestPrecedenceSoFar)
        addRules= true;

    } else if ( (line.length() >= 9)
                && (line.substring(0, 9).equalsIgnoreCase("Disallow:")) ) {

      doneAgents= true;
      String path= line.substring(line.indexOf(":") + 1);
      path= path.trim();
      try {
        path= URLDecoder.decode(path, CHARACTER_ENCODING);
      } catch (Exception e) {
        LOG.warning("error parsing robots rules- can't decode path: "
                    + path);
      }

      if (path.length() == 0) { // "empty rule": clears all previous rules
        if (addRules)
          currentRules.clearPrefixes();
      } else {                  // rule with a path prefix
        if (addRules)
          currentRules.addPrefix(path, false);
      }

    } else if ( (line.length() >= 6)
                && (line.substring(0, 6).equalsIgnoreCase("Allow:")) ) {

      doneAgents= true;
      String path= line.substring(line.indexOf(":") + 1);
      path= path.trim();

      if (path.length() == 0) {
        // "empty rule"- treat same as empty disallow
        if (addRules)
          currentRules.clearPrefixes();
      } else {                  // rule with a path prefix
        if (addRules)
          currentRules.addPrefix(path, true);
      }
    }
  }

  // archive the final stanza
  if (currentPrecedence < bestPrecedenceSoFar) {
    bestPrecedenceSoFar= currentPrecedence;
    bestRulesSoFar= currentRules;
  }

  if (bestPrecedenceSoFar == NO_PRECEDENCE)
    return EMPTY_RULES;   // no stanza matched any of our robot names
  return bestRulesSoFar;
}
/**
 * Returns a <code>RobotRuleSet</code> object appropriate for use
 * when the <code>robots.txt</code> file is empty or missing; all
 * requests are allowed.
 *
 * <p>NOTE(review): this returns the single shared EMPTY_RULES
 * instance; a caller invoking setExpireTime() on it would mutate
 * global state — verify callers treat it as read-only.
 */
static RobotRuleSet getEmptyRules() {
return EMPTY_RULES;
}
/**
 * Returns a <code>RobotRuleSet</code> object appropriate for use
 * when the <code>robots.txt</code> file is not fetched due to a
 * <code>403/Forbidden</code> response; all requests are
 * disallowed.
 */
static RobotRuleSet getForbidAllRules() {
  RobotRuleSet forbidAll= new RobotRuleSet();
  // an empty prefix matches every path, so this single rule blocks all
  forbidAll.addPrefix("", false);
  return forbidAll;
}
private final static int BUFSIZE= 2048;
/**
 * Command-line main for testing: parses a robots file with the given
 * agent name(s) and tests each URL path from a file against the rules.
 */
public static void main(String[] argv) {
  // Accept 3 or more args — the usage text advertises <agent-name>+,
  // so multiple agent names are allowed (the old "!= 3" check
  // contradicted that).
  if (argv.length < 3) {
    System.out.println("Usage:");
    System.out.println("   java <robots-file> <url-file> <agent-name>+");
    System.out.println("");
    System.out.println("The <robots-file> will be parsed as a robots.txt file,");
    System.out.println("using the given <agent-name> to select rules.  URLs ");
    System.out.println("will be read (one per line) from <url-file>, and tested");
    System.out.println("against the rules.");
    System.exit(-1);
  }

  try {
    FileInputStream robotsIn= new FileInputStream(argv[0]);
    LineNumberReader testsIn= new LineNumberReader(new FileReader(argv[1]));

    // Bug fix: the array must hold argv.length - 2 names (argv[2..]);
    // the old size of argv.length - 1 left a trailing null element
    // that caused a NullPointerException in the RobotRulesParser
    // constructor (null.toLowerCase()).
    String[] robotNames= new String[argv.length - 2];
    for (int i= 0; i < robotNames.length; i++)
      robotNames[i]= argv[i + 2];

    // slurp the whole robots file into a byte array, chunk by chunk
    ArrayList bufs= new ArrayList();
    byte[] buf= new byte[BUFSIZE];
    int totBytes= 0;

    int rsize= robotsIn.read(buf);
    while (rsize >= 0) {
      totBytes+= rsize;
      if (rsize != BUFSIZE) {
        // partial chunk: copy just the bytes actually read
        byte[] tmp= new byte[rsize];
        System.arraycopy(buf, 0, tmp, 0, rsize);
        bufs.add(tmp);
      } else {
        bufs.add(buf);
        buf= new byte[BUFSIZE];
      }
      rsize= robotsIn.read(buf);
    }
    robotsIn.close();   // was never closed before

    // concatenate the chunks into one contiguous array
    byte[] robotsBytes= new byte[totBytes];
    int pos= 0;
    for (int i= 0; i < bufs.size(); i++) {
      byte[] currBuf= (byte[]) bufs.get(i);
      System.arraycopy(currBuf, 0, robotsBytes, pos, currBuf.length);
      pos+= currBuf.length;
    }

    RobotRulesParser parser=
      new RobotRulesParser(robotNames);
    RobotRuleSet rules= parser.parseRules(robotsBytes);
    System.out.println("Rules:");
    System.out.println(rules);
    System.out.println();

    // Bug fix: the old code called trim() before the null check (NPE
    // on an empty url-file) and trimmed only the first line; trim
    // every line consistently.
    String testPath= testsIn.readLine();
    while (testPath != null) {
      testPath= testPath.trim();
      System.out.println( (rules.isAllowed(testPath) ?
                           "allowed" : "not allowed")
                          + ":\t" + testPath);
      testPath= testsIn.readLine();
    }
    testsIn.close();    // was never closed before
  } catch (Exception e) {
    e.printStackTrace();
  }
}
}